home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
C/C++ Users Group Library 1996 July
/
C-C++ Users Group Library July 1996.iso
/
vol_300
/
365_02
/
regexp.c
< prev
next >
Wrap
C/C++ Source or Header
|
1992-04-04
|
20KB
|
935 lines
/* regexp.c */
/* This file contains the code that compiles regular expressions and executes
* them. It supports the same syntax and features as vi's regular expression
* code. Specifically, the meta characters are:
* ^ matches the beginning of a line
* $ matches the end of a line
* \< matches the beginning of a word
* \> matches the end of a word
* . matches any single character
* [] matches any character in a character class
* \( delimits the start of a subexpression
* \) delimits the end of a subexpression
* * repeats the preceding 0 or more times
* NOTE: You cannot follow a \) with a *.
*
* The physical structure of a compiled RE is as follows:
* - First, there is a one-byte value that says how many character classes
* are used in this regular expression
* - Next, each character class is stored as a bitmap that is 256 bits
* (32 bytes) long.
* - A mixture of literal characters and compiled meta characters follows.
* This begins with M_BEGIN(0) and ends with M_END(0). All meta chars
* are stored as a \n followed by a one-byte code, so they take up two
* bytes apiece. Literal characters take up one byte apiece. \n can't
* be used as a literal character.
*
* If NO_MAGIC is defined, then a different set of functions is used instead.
* That right, this file contains TWO versions of the code.
*/
#include <setjmp.h>
#include "config.h"
#include "ctype.h"
#include "vi.h"
#include "regexp.h"
static char *previous; /* the previous regexp, used when null regexp is given */
#ifndef NO_MAGIC
/* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED */
/* These are used to classify or recognize meta-characters */
#define META '\0'
#define BASE_META(m) ((m) - 256)
#define INT_META(c) ((c) + 256)
#define IS_META(m) ((m) >= 256)
#define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
#define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))
#define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))
#define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE)
#define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m))
#define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s)
/* These are the internal codes used for each type of meta-character */
#define M_BEGLINE 256 /* internal code for ^ */
#define M_ENDLINE 257 /* internal code for $ */
#define M_BEGWORD 258 /* internal code for \< */
#define M_ENDWORD 259 /* internal code for \> */
#define M_ANY 260 /* internal code for . */
#define M_SPLAT 261 /* internal code for * */
#define M_PLUS 262 /* internal code for \+ */
#define M_QMARK 263 /* internal code for \? */
#define M_RANGE 264 /* internal code for \{ */
#define M_CLASS(n) (265+(n)) /* internal code for [] */
#define M_START(n) (275+(n)) /* internal code for \( */
#define M_END(n) (285+(n)) /* internal code for \) */
/* These are used during compilation */
static int class_cnt; /* used to assign class IDs */
static int start_cnt; /* used to assign start IDs */
static int end_stk[NSUBEXP];/* used to assign end IDs */
static int end_sp;
static char *retext; /* points to the text being compiled */
/* error-handling stuff */
jmp_buf errorhandler;
#define FAIL(why) regerror(why); longjmp(errorhandler, 1)
/* This function builds a bitmap for a particular class */
static char *makeclass(text, bmap)
REG char *text; /* start of the class */
REG char *bmap; /* the bitmap */
{
REG int i;
int complement = 0;
/* zero the bitmap */
for (i = 0; bmap && i < 32; i++)
{
bmap[i] = 0;
}
/* see if we're going to complement this class */
if (*text == '^')
{
text++;
complement = 1;
}
/* add in the characters */
while (*text && *text != ']')
{
/* is this a span of characters? */
if (text[1] == '-' && text[2])
{
/* spans can't be backwards */
if (text[0] > text[2])
{
FAIL("Backwards span in []");
}
/* add each character in the span to the bitmap */
for (i = text[0]; bmap && i <= text[2]; i++)
{
bmap[i >> 3] |= (1 << (i & 7));
}
/* move past this span */
text += 3;
}
else
{
/* add this single character to the span */
i = *text++;
if (bmap)
{
bmap[i >> 3] |= (1 << (i & 7));
}
}
}
/* make sure the closing ] is missing */
if (*text++ != ']')
{
FAIL("] missing");
}
/* if we're supposed to complement this class, then do so */
if (complement && bmap)
{
for (i = 0; i < 32; i++)
{
bmap[i] = ~bmap[i];
}
}
return text;
}
/* This function gets the next character or meta character from a string.
* The pointer is incremented by 1, or by 2 for \-quoted characters. For [],
* a bitmap is generated via makeclass() (if re is given), and the
* character-class text is skipped.
*/
static int gettoken(sptr, re)
char **sptr;
regexp *re;
{
int c;
c = **sptr;
++*sptr;
if (c == '\\')
{
c = **sptr;
++*sptr;
switch (c)
{
case '<':
return M_BEGWORD;
case '>':
return M_ENDWORD;
case '(':
if (start_cnt >= NSUBEXP)
{
FAIL("Too many \\(s");
}
end_stk[end_sp++] = start_cnt;
return M_START(start_cnt++);
case ')':
if (end_sp <= 0)
{
FAIL("Mismatched \\)");
}
return M_END(end_stk[--end_sp]);
case '*':
return (*o_magic ? c : M_SPLAT);
case '.':
return (*o_magic ? c : M_ANY);
case '+':
return M_PLUS;
case '?':
return M_QMARK;
#ifndef CRUNCH
case '{':
return M_RANGE;
#endif
default:
return c;
}
}
else if (*o_magic)
{
switch (c)
{
case '^':
if (*sptr == retext + 1)
{
return M_BEGLINE;
}
return c;
case '$':
if (!**sptr)
{
return M_ENDLINE;
}
return c;
case '.':
return M_ANY;
case '*':
return M_SPLAT;
case '[':
/* make sure we don't have too many classes */
if (class_cnt >= 10)
{
FAIL("Too many []s");
}
/* process the character list for this class */
if (re)
{
/* generate the bitmap for this class */
*sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt);
}
else
{
/* skip to end of the class */
*sptr = makeclass(*sptr, (char *)0);
}
return M_CLASS(class_cnt++);
default:
return c;
}
}
else /* unquoted nomagic */
{
switch (c)
{
case '^':
if (*sptr == retext + 1)
{
return M_BEGLINE;
}
return c;
case '$':
if (!**sptr)
{
return M_ENDLINE;
}
return c;
default:
return c;
}
}
/*NOTREACHED*/
}
/* This function calculates the number of bytes that will be needed for a
* compiled RE. Its argument is the uncompiled version. It is not clever
* about catching syntax errors; that is done in a later pass.
*/
static unsigned calcsize(text)
char *text;
{
unsigned size;
int token;
retext = text;
class_cnt = 0;
start_cnt = 1;
end_sp = 0;
size = 5;
while ((token = gettoken(&text, (regexp *)0)) != 0)
{
if (IS_CLASS(token))
{
size += 34;
}
#ifndef CRUNCH
else if (token == M_RANGE)
{
size += 4;
while ((token = gettoken(&text, (regexp *)0)) != 0
&& token != '}')
{
}
if (!token)
{
return size;
}
}
#endif
else if (IS_META(token))
{
size += 2;
}
else
{
size++;
}
}
return size;
}
/* This function compiles a regexp. */
regexp *regcomp(exp)
char *exp;
{
int needfirst;
unsigned size;
int token;
int peek;
char *build;
regexp *re;
#ifndef CRUNCH
int from;
int to;
int digit;
#endif
/* prepare for error handling */
re = (regexp *)0;
if (setjmp(errorhandler))
{
if (re)
{
free(re);
}
return (regexp *)0;
}
/* if an empty regexp string was given, use the previous one */
if (*exp == 0)
{
if (!previous)
{
FAIL("No previous RE");
}
exp = previous;
}
else /* non-empty regexp given, so remember it */
{